The data contains features extracted from the silhouettes of vehicles viewed from different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000 and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
Apply the dimensionality-reduction technique PCA and train a model using the principal components instead of training the model on just the raw data.
#import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report,roc_auc_score
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
# Load the vehicle-silhouette dataset and take a first look at the rows.
cData=pd.read_csv('vehicle-1.csv')
cData.head()
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
le = LabelEncoder()
columns = cData.columns
#Let's Label Encode our class variable:
print(columns)
# Replace the string class labels with integer codes so models can use them.
# LabelEncoder assigns codes in sorted label order.
cData['class'] = le.fit_transform(cData['class'])
# Quick shape / dtype / null overview of the raw data.
cData.shape
cData.info()
On performing the check it is found that
circularity, class, hollow_ratio, max.length_rectangularity, max.length_aspect_ratio and compactness have no missing values; all of the remaining features have some missing values.
All attributes are of numerical type.
from sklearn.impute import SimpleImputer
# Work on a copy so the raw cData stays untouched for comparison below.
newdf = cData.copy()
# NOTE(review): the slice 0:19 covers all 19 columns, i.e. the 18 numerical
# features PLUS the encoded 'class' column — presumably intentional so that
# newdf keeps the target; confirm 'class' truly has no NaNs to impute.
X = newdf.iloc[:,0:19] #take all columns of the dataframe for imputation
# NOTE(review): `verbose` was deprecated/removed in newer scikit-learn
# versions — drop it if the import fails.
imputer = SimpleImputer(missing_values=np.nan, strategy='median', verbose=1)
#fill missing values with the median of each column (strategy='median' above)
transformed_values = imputer.fit_transform(X)
column = X.columns
print(column)
# fit_transform returns a bare ndarray; restore the column names.
newdf = pd.DataFrame(transformed_values, columns = column )
newdf.describe()
# Before/after null counts to verify the imputation worked.
print("Original null value count:", cData.isnull().sum())
print("\n\nCount after we imputed the NaN value: ", newdf.isnull().sum())
If you carefully compare the original dataframe cData with the new dataframe newdf above,
you will find that after imputing the dataframe columns using SimpleImputer,
the missing NaN values from the original dataframe's columns have been treated and replaced using the median strategy.
# Transposed summary statistics — one row per feature is easier to scan.
newdf.describe().T
newdf.shape
# NOTE(review): the 'seaborn-whitegrid' style name was removed in
# matplotlib >= 3.6 (renamed 'seaborn-v0_8-whitegrid') — confirm the
# installed matplotlib version.
plt.style.use('seaborn-whitegrid')
# Histogram of every column to eyeball distributions and skewness.
newdf.hist(bins=20, figsize=(60,40), color='lightblue', edgecolor = 'red')
plt.show()
Most of the data attributes appear to be normally distributed.
scaled_variance.1, skewness_about, skewness_about.1 and scatter_ratio appear to be right-skewed.
pr.axis_rectangularity seems to have outliers, as there are some gaps visible in its histogram.
#We use the seaborn distplot to analyze the distribution of the columns and see the skewness in attributes.
# NOTE(review): distplot is deprecated in seaborn >= 0.11 (use histplot/displot).
# Also note six axes are created but ax[4] is never used (there is no vis5),
# so the fifth panel will be empty — confirm whether a plot was meant there.
f, ax = plt.subplots(1, 6, figsize=(30,5))
vis1 = sns.distplot(newdf["scaled_variance.1"],bins=10, ax= ax[0])
vis2 = sns.distplot(newdf["scaled_variance"],bins=10, ax=ax[1])
vis3 = sns.distplot(newdf["skewness_about.1"],bins=10, ax= ax[2])
vis4 = sns.distplot(newdf["skewness_about"],bins=10, ax=ax[3])
vis6 = sns.distplot(newdf["scatter_ratio"],bins=10, ax=ax[5])
f.savefig('subplot.png')
# Numeric skewness of every column to back up the visual impression.
skewValue = newdf.skew()
print("skewValue of dataframe attributes: ", skewValue)
#Summary View of all attributes
# One combined horizontal boxplot of every column for a quick outlier scan.
ax = sns.boxplot(data=newdf, orient="h")
# Individual boxplots (three per figure) for the columns that looked skewed
# or outlier-heavy in the histograms above.
plt.figure(figsize= (20,15))
plt.subplot(3,3,1)
sns.boxplot(x= newdf['pr.axis_aspect_ratio'], color='red')
plt.subplot(3,3,2)
sns.boxplot(x= newdf.skewness_about, color='blue')
plt.subplot(3,3,3)
sns.boxplot(x= newdf.scaled_variance, color='green')
plt.show()
plt.figure(figsize= (20,15))
plt.subplot(3,3,1)
sns.boxplot(x= newdf['radius_ratio'], color='red')
plt.subplot(3,3,2)
sns.boxplot(x= newdf['scaled_radius_of_gyration.1'], color='lightblue')
plt.subplot(3,3,3)
sns.boxplot(x= newdf['scaled_variance.1'], color='yellow')
plt.show()
plt.figure(figsize= (20,15))
plt.subplot(3,3,1)
sns.boxplot(x= newdf['max.length_aspect_ratio'], color='green')
plt.subplot(3,3,2)
sns.boxplot(x= newdf['skewness_about.1'], color='grey')
plt.show()
pr.axis_aspect_ratio, skewness_about, max.length_aspect_ratio, skewness_about.1,
scaled_radius_of_gyration.1, scaled_variance.1 and radius_ratio are
some of the attributes with outliers, visible as the dotted points beyond the whiskers.
# NOTE(review): `iqr` is imported but never used — the IQR is computed
# manually from the quantiles below.
from scipy.stats import iqr
# Tukey's fences: flag values beyond 1.5 * IQR from the quartiles.
Q1 = newdf.quantile(0.25)
Q3 = newdf.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
# Keep only rows with NO column outside the fences (any outlier drops the row).
cleandf = newdf[~((newdf < (Q1 - 1.5 * IQR)) |(newdf > (Q3 + 1.5 * IQR))).any(axis=1)]
cleandf.shape
# Re-plot the previously outlier-heavy columns to confirm the filtering.
# NOTE(review): a later cell reassigns `cleandf` from `newdf`, so this
# filtered frame is not actually used downstream — confirm intent.
plt.figure(figsize= (20,15))
plt.subplot(8,8,1)
sns.boxplot(x= cleandf['pr.axis_aspect_ratio'], color='orange')
plt.subplot(8,8,2)
sns.boxplot(x= cleandf.skewness_about, color='purple')
plt.subplot(8,8,3)
sns.boxplot(x= cleandf.scaled_variance, color='brown')
plt.subplot(8,8,4)
sns.boxplot(x= cleandf['radius_ratio'], color='red')
plt.subplot(8,8,5)
sns.boxplot(x= cleandf['scaled_radius_of_gyration.1'], color='lightblue')
plt.subplot(8,8,6)
sns.boxplot(x= cleandf['scaled_variance.1'], color='yellow')
plt.subplot(8,8,7)
sns.boxplot(x= cleandf['max.length_aspect_ratio'], color='lightblue')
plt.subplot(8,8,8)
sns.boxplot(x= cleandf['skewness_about.1'], color='pink')
plt.show()
We can see that the outliers visible in the earlier boxplots have been treated and removed.
Since the number of outliers was small, we opted to remove them.
Generally we avoid this, as it can lead to information loss in the case of large data sets with a large number of outliers.
def correlation_heatmap(dataframe, l, w):
    """Plot an annotated heatmap of the pairwise feature correlations.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data whose column-wise correlation matrix is plotted.
    l, w : numeric
        Figure width and height in inches, passed to ``plt.figure``.
    """
    correlation = dataframe.corr()
    plt.figure(figsize=(l, w))
    # vmax=1 pins the colour scale at perfect correlation.
    sns.heatmap(correlation, vmax=1, square=True, annot=True, cmap='YlGnBu')
    # Fixed typo in the displayed title ('fearures' -> 'features').
    plt.title('Correlation between different features')
    plt.show()
# Let's Drop Class column and see the correlation Matrix & Pairplot Before using this dataframe for PCA as PCA should only be performed on independent attribute
# NOTE(review): this rebuilds `cleandf` from `newdf`, silently discarding the
# outlier-filtered `cleandf` created earlier — confirm this is intentional.
cleandf= newdf.drop('class', axis=1)
#print("After Dropping: ", cleandf)
correlation_heatmap(cleandf, 30,15)
Scaled_variance and scaled_variance.1 appear to be strongly correlated, with a value of 0.98.
Skewness_about.2 and hollow_ratio appear to be strongly correlated, with a value of 0.89.
Distance_circularity and radius_ratio appear to have a high positive correlation, with a value of 0.81.
Compactness & circularity, and radius_ratio & pr.axis_aspect_ratio, also appear to be
moderately correlated, with a value of 0.67.
Scaled_variance & scaled_radius_of_gyration, and circularity & distance_circularity, also appear to
be highly correlated, with a value of 0.79.
Pr.axis_rectangularity and max.length_rectangularity also appear to be strongly correlated, with a value of 0.81.
Scatter_ratio and elongatedness appear to have a strong negative correlation, with a value of 0.97.
Elongatedness and pr.axis_rectangularity appear to have a strong negative correlation, with a value of 0.95.
Max.length_aspect_ratio & radius_ratio have an average correlation, with a coefficient of 0.5.
Pr.axis_aspect_ratio & max.length_aspect_ratio appear to have very little correlation.
Scaled_radius_of_gyration & scaled_radius_of_gyration.1 appear to be very little correlated.
Scaled_radius_of_gyration.1 & skewness_about appear to be very little correlated.
Skewness_about & skewness_about.1 do not appear to be correlated.
Skewness_about.1 and skewness_about.2 are not correlated.
# Pairwise scatter plots (KDE on the diagonal) to visualise the correlations.
sns.pairplot(cleandf, diag_kind="kde")
From the correlation matrix above we can see that there are many features which are highly correlated.
If we analyse carefully, we will find many feature pairs with a correlation above 0.9.
So we could get rid of those columns whose correlation magnitude is 0.9 or above.
There are 8 such columns:
Max.length_rectangularity
Scaled_radius_of_gyration
Skewness_about.2
Scatter_ratio
Elongatedness
Pr.axis_rectangularity
Scaled_variance
Scaled_variance.1
#display how many are car,bus,van.
newdf['class'].value_counts()
# Separate copy used for the split-and-scale pipeline below.
splitscaledf = newdf.copy()
# NOTE(review): passing the series positionally to countplot is deprecated in
# newer seaborn — use sns.countplot(x=newdf['class']) if it warns.
sns.countplot(newdf['class'])
plt.show()
splitscaledf.head(850)
# Columns 0..17 are the 18 silhouette features; column 18 is the encoded class.
splitscale_X = splitscaledf.iloc[:,0:18].values
splitscale_y = splitscaledf.iloc[:,18].values
print("Indpendent Variable X",splitscale_X )
print("Class Variable y",splitscale_y )
#splitting the data in train and test sets into 70:30 Ratio
# random_state fixed for reproducible splits.
SplitScale_X_train, SplitScale_X_test, SplitScale_y_train, SplitScale_y_test = train_test_split(splitscale_X,splitscale_y, test_size = 0.3, random_state = 10)
from sklearn.preprocessing import StandardScaler
# We transform (centralize) the entire X (independent variable data) to normalize it using standardscalar through transformation.
# We will create the PCA dimensions on this distribution.
sc = StandardScaler()
X_std = sc.fit_transform(X)
# FIX: fit the scaler on the TRAINING split only and reuse it on the test
# split. The original code fit a second StandardScaler on the test set,
# which leaks test-set statistics and scales the two splits inconsistently
# (each split was centred/scaled by its own mean and std).
split_scaler = StandardScaler().fit(SplitScale_X_train)
ssx_train_sd = split_scaler.transform(SplitScale_X_train)
ssx_test_sd = split_scaler.transform(SplitScale_X_test)
print(len(ssx_train_sd))
print(len(ssx_test_sd))
# generating the covariance matrix and the eigen values for the PCA analysis
# np.cov expects variables in rows, hence the transpose.
cov_matrix_1 = np.cov(ssx_train_sd.T) # the relevant covariance matrix
print('Covariance Matrix \n%s', (cov_matrix_1))
# FIX: eigen-decompose the covariance matrix ONCE. The original code called
# np.linalg.eig twice on the same matrix and kept two identical result pairs
# (e_vals/e_vecs and eigenvalues/eigenvectors); both aliases are preserved so
# downstream cells using either name still work.
eigenvalues, eigenvectors = np.linalg.eig(cov_matrix_1)
e_vals, e_vecs = eigenvalues, eigenvectors
print('Eigenvectors \n%s' %(e_vecs))
print('\nEigenvalues \n%s' %e_vals)
print('Eigen Vectors \n%s', eigenvectors)
print('\n Eigen Values \n%s', eigenvalues)
# Make a set of (eigenvalue, eigenvector) pairs
eig_pairs = [(eigenvalues[index], eigenvectors[:,index]) for index in range(len(eigenvalues))]
# Sort the pairs from highest to lowest eigenvalue.
# FIX: sort on the eigenvalue only. The original tuple sort falls back to
# comparing the eigenvector arrays when two eigenvalues tie, which raises
# "truth value of an array is ambiguous".
eig_pairs.sort(key=lambda pair: pair[0], reverse=True)
print(eig_pairs)
# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [eig_pairs[index][0] for index in range(len(eigenvalues))]
eigvectors_sorted = [eig_pairs[index][1] for index in range(len(eigenvalues))]
# Let's confirm our sorting worked, print out eigenvalues
print('Eigenvalues in descending order: \n%s' %eigvalues_sorted)
# Fraction of total variance captured by each component, largest first.
tot = sum(eigenvalues)
var_explained = [(i / tot) for i in sorted(eigenvalues, reverse=True)] # an array of variance explained by each
# eigen vector... there will be 18 entries as there are 18 eigen vectors
cum_var_exp = np.cumsum(var_explained) # an array of cumulative variance. There will be 18 entries with the 18th entry
# cumulative reaching almost 100%
# Scree plot: individual bars plus the cumulative step curve.
plt.bar(range(1,19), var_explained, alpha=0.5, align='center', label='individual explained variance')
plt.step(range(1,19),cum_var_exp, where= 'mid', label='cumulative explained variance')
plt.ylabel('Explained variance ratio')
plt.xlabel('Principal components')
plt.legend(loc = 'best')
plt.show()
# P_reduce represents reduced mathematical space....
# Keep only the top 8 eigenvectors (rows), chosen from the scree plot above.
P_reduce_1 = np.array(eigvectors_sorted[0:8])
# Project the standardized train/test data onto the 8 principal components.
X_train_std_pca = np.dot(ssx_train_sd,P_reduce_1.T)
X_test_std_pca = np.dot(ssx_test_sd,P_reduce_1.T)
print(X_train_std_pca)
print(X_test_std_pca)
# Wrap the projections in DataFrames so seaborn can pairplot them.
Projected_df_train = pd.DataFrame(X_train_std_pca)
Projected_df_test = pd.DataFrame(X_test_std_pca)
sns.pairplot(Projected_df_train, diag_kind='kde')
sns.pairplot(Projected_df_test, diag_kind='kde')
We can see that after performing PCA our new dataframe with reduced dimensions has little to no linear relationship
among its components, which is the main objective of using PCA.
Almost all attributes appear as a cloud of points in the mathematical space with no clear positive or negative correlation.
# Sanity-check the shapes of the inputs and the projected matrices.
ssx_train_sd.shape, P_reduce_1.T.shape, X_train_std_pca.shape, X_test_std_pca.shape
# Baseline SVC on all 18 standardized features.
clf1 = SVC()
clf1.fit(ssx_train_sd, SplitScale_y_train)
print ('Before PCA score', clf1.score(ssx_test_sd, SplitScale_y_test))
# Second SVC trained on the 8 principal components only.
clf2 = SVC()
clf2.fit(X_train_std_pca, SplitScale_y_train)
print ('After PCA score', clf2.score(X_test_std_pca, SplitScale_y_test))
#predict the y value
pca_yhat_predict= clf2.predict(X_test_std_pca)
#orginal data y value
orig_yhat_predict = clf1.predict(ssx_test_sd)
# Accuracy on the held-out test set for both models.
print("Before PCA On Original 18 Dimension",accuracy_score(SplitScale_y_test,orig_yhat_predict))
print("After PCA(On 8 dimension)",accuracy_score(SplitScale_y_test,pca_yhat_predict))
def draw_confmatrix(y_test, yhat, str1, str2, str3, datatype ):
    """Print and plot a labelled confusion matrix for a 3-class prediction.

    Parameters
    ----------
    y_test : array-like
        True class labels, encoded as 0, 1, 2.
    yhat : array-like
        Predicted class labels.
    str1, str2, str3 : str
        Display names shown on the heatmap axes for classes 0, 1 and 2.
    datatype : str
        Description of the data set, used in the printed heading.
    """
    # FIX: pass `labels` by keyword. In scikit-learn >= 0.22 the third
    # positional argument of confusion_matrix is `sample_weight`, so the old
    # positional call fails (and earlier, risked mis-binding the argument).
    cm = confusion_matrix( y_test, yhat, labels=[0,1,2] )
    print("Confusion Matrix For :", "\n",datatype,cm )
    # FIX: fmt='d' — the matrix holds integer counts; '.2f' rendered them
    # with a spurious decimal point.
    sns.heatmap(cm, annot=True, fmt='d', xticklabels = [str1, str2,str3] , yticklabels = [str1, str2,str3] )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
# NOTE(review): LabelEncoder assigns codes in sorted label order, so if the
# raw labels are 'bus'/'car'/'van' the codes 0/1/2 map to Bus/Car/Van — the
# display names passed here ("Van","Car","Bus") may be in the wrong order;
# verify against le.classes_.
draw_confmatrix(SplitScale_y_test, orig_yhat_predict,"Van ", "Car ", "Bus", "Original Data Set" )
draw_confmatrix(SplitScale_y_test, pca_yhat_predict,"Van ", "Car ", "Bus", "For Reduced Dimensions Using PCA ")
#Classification Report Of Model built on Raw Data
print("Classification Report For Raw Data:", "\n", classification_report(SplitScale_y_test,orig_yhat_predict))
#Classification Report Of Model built on Principal Components:
print("Classification Report For PCA:","\n", classification_report(SplitScale_y_test,pca_yhat_predict))
On the held-out test set we saw that our support vector classifier without PCA has an accuracy score of about 95%.
When we applied the SVC model to the PCA components (reduced dimensions), the model still performed comparably, at around 95%, with only a minuscule fall in score.